# Global knitr chunk default: echo = TRUE shows each chunk's source code in the rendered report.
knitr::opts_chunk$set(echo = TRUE)
library(readxl)
## Warning: package 'readxl' was built under R version 3.4.4
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.5
## ✔ tidyr 0.8.1 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
Bring in the data!
# Load the raw attrition data from the case-study workbook.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists.
attraw <- read_excel("~/Desktop/SMU Classes/MSDS 6306/Case Study II/CaseStudy2_2/CaseStudy2-data.xlsx")

# View() opens a data viewer, which is only useful in an interactive session,
# so skip it when the document is knitted or run as a batch script.
if (interactive()) {
  View(attraw)
}

# Observations from the summary below: no NAs apparent; 1470 rows of data; no
# one under age 18; employee numbers go up to 2068 although there are only
# 1470 rows; job satisfaction has mean 2.7 and median 3.0.
summary(attraw)
## Age Attrition BusinessTravel DailyRate
## Min. :18.00 Length:1470 Length:1470 Min. : 102.0
## 1st Qu.:30.00 Class :character Class :character 1st Qu.: 465.0
## Median :36.00 Mode :character Mode :character Median : 802.0
## Mean :36.92 Mean : 802.5
## 3rd Qu.:43.00 3rd Qu.:1157.0
## Max. :60.00 Max. :1499.0
## Department DistanceFromHome Education EducationField
## Length:1470 Min. : 1.000 Min. :1.000 Length:1470
## Class :character 1st Qu.: 2.000 1st Qu.:2.000 Class :character
## Mode :character Median : 7.000 Median :3.000 Mode :character
## Mean : 9.193 Mean :2.913
## 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :29.000 Max. :5.000
## EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender
## Min. :1 Min. : 1.0 Min. :1.000 Length:1470
## 1st Qu.:1 1st Qu.: 491.2 1st Qu.:2.000 Class :character
## Median :1 Median :1020.5 Median :3.000 Mode :character
## Mean :1 Mean :1024.9 Mean :2.722
## 3rd Qu.:1 3rd Qu.:1555.8 3rd Qu.:4.000
## Max. :1 Max. :2068.0 Max. :4.000
## HourlyRate JobInvolvement JobLevel JobRole
## Min. : 30.00 Min. :1.00 Min. :1.000 Length:1470
## 1st Qu.: 48.00 1st Qu.:2.00 1st Qu.:1.000 Class :character
## Median : 66.00 Median :3.00 Median :2.000 Mode :character
## Mean : 65.89 Mean :2.73 Mean :2.064
## 3rd Qu.: 83.75 3rd Qu.:3.00 3rd Qu.:3.000
## Max. :100.00 Max. :4.00 Max. :5.000
## JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate
## Min. :1.000 Length:1470 Min. : 1009 Min. : 2094
## 1st Qu.:2.000 Class :character 1st Qu.: 2911 1st Qu.: 8047
## Median :3.000 Mode :character Median : 4919 Median :14236
## Mean :2.729 Mean : 6503 Mean :14313
## 3rd Qu.:4.000 3rd Qu.: 8379 3rd Qu.:20462
## Max. :4.000 Max. :19999 Max. :26999
## NumCompaniesWorked Over18 OverTime
## Min. :0.000 Length:1470 Length:1470
## 1st Qu.:1.000 Class :character Class :character
## Median :2.000 Mode :character Mode :character
## Mean :2.693
## 3rd Qu.:4.000
## Max. :9.000
## PercentSalaryHike PerformanceRating RelationshipSatisfaction
## Min. :11.00 Min. :3.000 Min. :1.000
## 1st Qu.:12.00 1st Qu.:3.000 1st Qu.:2.000
## Median :14.00 Median :3.000 Median :3.000
## Mean :15.21 Mean :3.154 Mean :2.712
## 3rd Qu.:18.00 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :25.00 Max. :4.000 Max. :4.000
## StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear
## Min. :80 Min. :0.0000 Min. : 0.00 Min. :0.000
## 1st Qu.:80 1st Qu.:0.0000 1st Qu.: 6.00 1st Qu.:2.000
## Median :80 Median :1.0000 Median :10.00 Median :3.000
## Mean :80 Mean :0.7939 Mean :11.28 Mean :2.799
## 3rd Qu.:80 3rd Qu.:1.0000 3rd Qu.:15.00 3rd Qu.:3.000
## Max. :80 Max. :3.0000 Max. :40.00 Max. :6.000
## WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :1.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.000 1st Qu.: 3.000 1st Qu.: 2.000
## Median :3.000 Median : 5.000 Median : 3.000
## Mean :2.761 Mean : 7.008 Mean : 4.229
## 3rd Qu.:3.000 3rd Qu.: 9.000 3rd Qu.: 7.000
## Max. :4.000 Max. :40.000 Max. :18.000
## YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 1.000 Median : 3.000
## Mean : 2.188 Mean : 4.123
## 3rd Qu.: 3.000 3rd Qu.: 7.000
## Max. :15.000 Max. :17.000
Graphically look at the data
# List the variable names available in the raw data.
names(attraw)
## [1] "Age" "Attrition"
## [3] "BusinessTravel" "DailyRate"
## [5] "Department" "DistanceFromHome"
## [7] "Education" "EducationField"
## [9] "EmployeeCount" "EmployeeNumber"
## [11] "EnvironmentSatisfaction" "Gender"
## [13] "HourlyRate" "JobInvolvement"
## [15] "JobLevel" "JobRole"
## [17] "JobSatisfaction" "MaritalStatus"
## [19] "MonthlyIncome" "MonthlyRate"
## [21] "NumCompaniesWorked" "Over18"
## [23] "OverTime" "PercentSalaryHike"
## [25] "PerformanceRating" "RelationshipSatisfaction"
## [27] "StandardHours" "StockOptionLevel"
## [29] "TotalWorkingYears" "TrainingTimesLastYear"
## [31] "WorkLifeBalance" "YearsAtCompany"
## [33] "YearsInCurrentRole" "YearsSinceLastPromotion"
## [35] "YearsWithCurrManager"
# One distribution plot per column of the raw data, in column order.
# Columns with few distinct values get a bar per value via geom_bar().
# Columns where a bin width was requested (DailyRate, MonthlyIncome,
# MonthlyRate, PerformanceRating) use geom_histogram(): geom_bar() has no
# `binwidth` parameter and only emits a warning when one is passed.
ggplot(attraw, aes(x = Age)) + geom_bar()
ggplot(attraw, aes(x = Attrition)) + geom_bar()
ggplot(attraw, aes(x = BusinessTravel)) + geom_bar()
ggplot(attraw, aes(x = DailyRate)) + geom_histogram(binwidth = 100)
ggplot(attraw, aes(x = Department)) + geom_bar()
ggplot(attraw, aes(x = DistanceFromHome)) + geom_bar()
ggplot(attraw, aes(x = Education)) + geom_bar()
ggplot(attraw, aes(x = EducationField)) + geom_bar()
ggplot(attraw, aes(x = EmployeeCount)) + geom_bar()
ggplot(attraw, aes(x = EmployeeNumber)) + geom_bar()
ggplot(attraw, aes(x = EnvironmentSatisfaction)) + geom_bar()
ggplot(attraw, aes(x = Gender)) + geom_bar()
ggplot(attraw, aes(x = HourlyRate)) + geom_bar()
ggplot(attraw, aes(x = JobInvolvement)) + geom_bar()
ggplot(attraw, aes(x = JobLevel)) + geom_bar()
ggplot(attraw, aes(x = JobRole)) + geom_bar()
ggplot(attraw, aes(x = JobSatisfaction)) + geom_bar()
ggplot(attraw, aes(x = MaritalStatus)) + geom_bar()
ggplot(attraw, aes(x = MonthlyIncome)) + geom_histogram(binwidth = 5000)
ggplot(attraw, aes(x = MonthlyRate)) + geom_histogram(binwidth = 10000)
ggplot(attraw, aes(x = NumCompaniesWorked)) + geom_bar()
ggplot(attraw, aes(x = Over18)) + geom_bar()
ggplot(attraw, aes(x = OverTime)) + geom_bar()
ggplot(attraw, aes(x = PercentSalaryHike)) + geom_bar()
ggplot(attraw, aes(x = PerformanceRating)) + geom_histogram(binwidth = 0.5)
ggplot(attraw, aes(x = RelationshipSatisfaction)) + geom_bar()
ggplot(attraw, aes(x = StandardHours)) + geom_bar()
ggplot(attraw, aes(x = StockOptionLevel)) + geom_bar()
ggplot(attraw, aes(x = TotalWorkingYears)) + geom_bar()
ggplot(attraw, aes(x = TrainingTimesLastYear)) + geom_bar()
ggplot(attraw, aes(x = WorkLifeBalance)) + geom_bar()
ggplot(attraw, aes(x = YearsAtCompany)) + geom_bar()
ggplot(attraw, aes(x = YearsInCurrentRole)) + geom_bar()
ggplot(attraw, aes(x = YearsSinceLastPromotion)) + geom_bar()
ggplot(attraw, aes(x = YearsWithCurrManager)) + geom_bar()
Graphically look at the data more
# Scatter of employee number against years at the company.
ggplot(attraw, aes(x = EmployeeNumber, y = YearsAtCompany)) + geom_point()

# Split the raw data by attrition outcome, keeping every column:
# attryes holds the rows with Attrition == "Yes", attrno those with "No".
attryes <- dplyr::filter(attraw, Attrition == "Yes")
attrno <- dplyr::filter(attraw, Attrition == "No")

# Per-column summary for the employees who left.
summary(attryes)
## Age Attrition BusinessTravel DailyRate
## Min. :18.00 Length:237 Length:237 Min. : 103.0
## 1st Qu.:28.00 Class :character Class :character 1st Qu.: 408.0
## Median :32.00 Mode :character Mode :character Median : 699.0
## Mean :33.61 Mean : 750.4
## 3rd Qu.:39.00 3rd Qu.:1092.0
## Max. :58.00 Max. :1496.0
## Department DistanceFromHome Education EducationField
## Length:237 Min. : 1.00 Min. :1.00 Length:237
## Class :character 1st Qu.: 3.00 1st Qu.:2.00 Class :character
## Mode :character Median : 9.00 Median :3.00 Mode :character
## Mean :10.63 Mean :2.84
## 3rd Qu.:17.00 3rd Qu.:4.00
## Max. :29.00 Max. :5.00
## EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender
## Min. :1 Min. : 1 Min. :1.000 Length:237
## 1st Qu.:1 1st Qu.: 514 1st Qu.:1.000 Class :character
## Median :1 Median :1017 Median :3.000 Mode :character
## Mean :1 Mean :1010 Mean :2.464
## 3rd Qu.:1 3rd Qu.:1486 3rd Qu.:4.000
## Max. :1 Max. :2055 Max. :4.000
## HourlyRate JobInvolvement JobLevel JobRole
## Min. : 31.00 Min. :1.000 Min. :1.000 Length:237
## 1st Qu.: 50.00 1st Qu.:2.000 1st Qu.:1.000 Class :character
## Median : 66.00 Median :3.000 Median :1.000 Mode :character
## Mean : 65.57 Mean :2.519 Mean :1.637
## 3rd Qu.: 84.00 3rd Qu.:3.000 3rd Qu.:2.000
## Max. :100.00 Max. :4.000 Max. :5.000
## JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate
## Min. :1.000 Length:237 Min. : 1009 Min. : 2326
## 1st Qu.:1.000 Class :character 1st Qu.: 2373 1st Qu.: 8870
## Median :3.000 Mode :character Median : 3202 Median :14618
## Mean :2.468 Mean : 4787 Mean :14559
## 3rd Qu.:3.000 3rd Qu.: 5916 3rd Qu.:21081
## Max. :4.000 Max. :19859 Max. :26999
## NumCompaniesWorked Over18 OverTime
## Min. :0.000 Length:237 Length:237
## 1st Qu.:1.000 Class :character Class :character
## Median :1.000 Mode :character Mode :character
## Mean :2.941
## 3rd Qu.:5.000
## Max. :9.000
## PercentSalaryHike PerformanceRating RelationshipSatisfaction
## Min. :11.0 Min. :3.000 Min. :1.000
## 1st Qu.:12.0 1st Qu.:3.000 1st Qu.:2.000
## Median :14.0 Median :3.000 Median :3.000
## Mean :15.1 Mean :3.156 Mean :2.599
## 3rd Qu.:17.0 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :25.0 Max. :4.000 Max. :4.000
## StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear
## Min. :80 Min. :0.0000 Min. : 0.000 Min. :0.000
## 1st Qu.:80 1st Qu.:0.0000 1st Qu.: 3.000 1st Qu.:2.000
## Median :80 Median :0.0000 Median : 7.000 Median :2.000
## Mean :80 Mean :0.5274 Mean : 8.245 Mean :2.624
## 3rd Qu.:80 3rd Qu.:1.0000 3rd Qu.:10.000 3rd Qu.:3.000
## Max. :80 Max. :3.0000 Max. :40.000 Max. :6.000
## WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :1.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.000 1st Qu.: 1.000 1st Qu.: 0.000
## Median :3.000 Median : 3.000 Median : 2.000
## Mean :2.658 Mean : 5.131 Mean : 2.903
## 3rd Qu.:3.000 3rd Qu.: 7.000 3rd Qu.: 4.000
## Max. :4.000 Max. :40.000 Max. :15.000
## YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 1.000 Median : 2.000
## Mean : 1.945 Mean : 2.852
## 3rd Qu.: 2.000 3rd Qu.: 5.000
## Max. :15.000 Max. :14.000
# Per-column summary for the employees who stayed (Attrition == "No"), for comparison with summary(attryes).
summary(attrno)
## Age Attrition BusinessTravel DailyRate
## Min. :18.00 Length:1233 Length:1233 Min. : 102.0
## 1st Qu.:31.00 Class :character Class :character 1st Qu.: 477.0
## Median :36.00 Mode :character Mode :character Median : 817.0
## Mean :37.56 Mean : 812.5
## 3rd Qu.:43.00 3rd Qu.:1176.0
## Max. :60.00 Max. :1499.0
## Department DistanceFromHome Education EducationField
## Length:1233 Min. : 1.000 Min. :1.000 Length:1233
## Class :character 1st Qu.: 2.000 1st Qu.:2.000 Class :character
## Mode :character Median : 7.000 Median :3.000 Mode :character
## Mean : 8.916 Mean :2.927
## 3rd Qu.:13.000 3rd Qu.:4.000
## Max. :29.000 Max. :5.000
## EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender
## Min. :1 Min. : 2 Min. :1.000 Length:1233
## 1st Qu.:1 1st Qu.: 483 1st Qu.:2.000 Class :character
## Median :1 Median :1022 Median :3.000 Mode :character
## Mean :1 Mean :1028 Mean :2.771
## 3rd Qu.:1 3rd Qu.:1574 3rd Qu.:4.000
## Max. :1 Max. :2068 Max. :4.000
## HourlyRate JobInvolvement JobLevel JobRole
## Min. : 30.00 Min. :1.00 Min. :1.000 Length:1233
## 1st Qu.: 48.00 1st Qu.:2.00 1st Qu.:1.000 Class :character
## Median : 66.00 Median :3.00 Median :2.000 Mode :character
## Mean : 65.95 Mean :2.77 Mean :2.146
## 3rd Qu.: 83.00 3rd Qu.:3.00 3rd Qu.:3.000
## Max. :100.00 Max. :4.00 Max. :5.000
## JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate
## Min. :1.000 Length:1233 Min. : 1051 Min. : 2094
## 1st Qu.:2.000 Class :character 1st Qu.: 3211 1st Qu.: 7973
## Median :3.000 Mode :character Median : 5204 Median :14120
## Mean :2.779 Mean : 6833 Mean :14266
## 3rd Qu.:4.000 3rd Qu.: 8834 3rd Qu.:20364
## Max. :4.000 Max. :19999 Max. :26997
## NumCompaniesWorked Over18 OverTime
## Min. :0.000 Length:1233 Length:1233
## 1st Qu.:1.000 Class :character Class :character
## Median :2.000 Mode :character Mode :character
## Mean :2.646
## 3rd Qu.:4.000
## Max. :9.000
## PercentSalaryHike PerformanceRating RelationshipSatisfaction
## Min. :11.00 Min. :3.000 Min. :1.000
## 1st Qu.:12.00 1st Qu.:3.000 1st Qu.:2.000
## Median :14.00 Median :3.000 Median :3.000
## Mean :15.23 Mean :3.153 Mean :2.734
## 3rd Qu.:18.00 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :25.00 Max. :4.000 Max. :4.000
## StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear
## Min. :80 Min. :0.0000 Min. : 0.00 Min. :0.000
## 1st Qu.:80 1st Qu.:0.0000 1st Qu.: 6.00 1st Qu.:2.000
## Median :80 Median :1.0000 Median :10.00 Median :3.000
## Mean :80 Mean :0.8451 Mean :11.86 Mean :2.833
## 3rd Qu.:80 3rd Qu.:1.0000 3rd Qu.:16.00 3rd Qu.:3.000
## Max. :80 Max. :3.0000 Max. :38.00 Max. :6.000
## WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :1.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.000 1st Qu.: 3.000 1st Qu.: 2.000
## Median :3.000 Median : 6.000 Median : 3.000
## Mean :2.781 Mean : 7.369 Mean : 4.484
## 3rd Qu.:3.000 3rd Qu.:10.000 3rd Qu.: 7.000
## Max. :4.000 Max. :37.000 Max. :18.000
## YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 1.000 Median : 3.000
## Mean : 2.234 Mean : 4.367
## 3rd Qu.: 3.000 3rd Qu.: 7.000
## Max. :15.000 Max. :17.000
# Numeric columns whose means are compared across the three data sets.
# Character columns cannot be averaged; EmployeeCount, EmployeeNumber and
# StandardHours are also left out of the comparison.
mean_cols <- c(
  "Age", "DailyRate", "DistanceFromHome", "Education",
  "EnvironmentSatisfaction", "HourlyRate", "JobInvolvement", "JobLevel",
  "JobSatisfaction", "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked",
  "PercentSalaryHike", "PerformanceRating", "RelationshipSatisfaction",
  "StockOptionLevel", "TotalWorkingYears", "TrainingTimesLastYear",
  "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole",
  "YearsSinceLastPromotion", "YearsWithCurrManager"
)

# Column means for all employees, for leavers, and for stayers.
Mndat <- colMeans(attraw[, mean_cols])
Mnyes <- colMeans(attryes[, mean_cols])
Mnno <- colMeans(attrno[, mean_cols])

# One column per data set, one row per variable. tibble() replaces
# data_frame(), which is deprecated in newer tibble releases.
meandf <- tibble(Mndat, Mnyes, Mnno)

# One row per data set, one column per variable. rbind() on the named mean
# vectors keeps the variable names as column names; t(meandf) produced the
# same numbers but with unlabeled columns.
tmeandf <- rbind(Mndat, Mnyes, Mnno)
# Compare column means of the raw data vs. no-attrition vs. yes-attrition. Items of note:
# Variables that seem to have little value: EmployeeNumber (doesn't seem to correlate with longer time at the company), EmployeeCount (all = 1), StandardHours (all = 80), Over18 (all = Y).
# Significant variables needing more insight:
# Age: mean for all = 36.9, yes = 33.6, no = 37.5. Median and mean are close in all sets.
# DailyRate: mean for all = 802, yes = 750.36, no = 812. In the yes-attrition set the median is about 50 lower than the mean.
# JobLevel: mean for all = 2.06, yes = 1.63, no = 2.14. The median for yes is 1, much lower than the others.
# MonthlyIncome: mean for all = 6502.93, yes = 4787.93, no = 6832.74. About 1500 difference between median and mean in every set.
# StockOptionLevel: mean for all = .793, yes = .527, no = .845. Median for all and no = 1; yes is much lower with a median of 0.
# TotalWorkingYears: mean for all = 11.27, yes = 8.24, no = 11.86. Median and mean are close in all sets.
# YearsAtCompany: mean for all = 7.00, yes = 5.13, no = 7.36. Big jump of 40-60% from median to mean in all data sets.
# YearsInCurrentRole: mean for all = 4.22, yes = 2.90, no = 4.48. Jump of about 50% from median to mean in all data sets.
# YearsSinceLastPromotion: mean for all = 2.18, yes = 1.94, no = 2.23. Big jumps of 94 to 118% from median to mean in all data sets.
# YearsWithCurrManager: mean for all = 4.12, yes = 2.85, no = 4.36. Jumps of about 40% from median to mean.
# Still need to look closer at the categorical data.
Attrition only
# One distribution plot per column for the employees who left (attryes),
# in column order.
# Columns where a bin width was requested (DailyRate, MonthlyIncome,
# MonthlyRate, PerformanceRating, StandardHours) use geom_histogram():
# geom_bar() has no `binwidth` parameter and only emits a warning when one
# is passed.
ggplot(attryes, aes(x = Age)) + geom_bar()
ggplot(attryes, aes(x = Attrition)) + geom_bar()
ggplot(attryes, aes(x = BusinessTravel)) + geom_bar()
ggplot(attryes, aes(x = DailyRate)) + geom_histogram(binwidth = 100)
ggplot(attryes, aes(x = Department)) + geom_bar()
ggplot(attryes, aes(x = DistanceFromHome)) + geom_bar()
ggplot(attryes, aes(x = Education)) + geom_bar()
ggplot(attryes, aes(x = EducationField)) + geom_bar()
ggplot(attryes, aes(x = EmployeeCount)) + geom_bar()
ggplot(attryes, aes(x = EmployeeNumber)) + geom_bar()
ggplot(attryes, aes(x = EnvironmentSatisfaction)) + geom_bar()
ggplot(attryes, aes(x = Gender)) + geom_bar()
ggplot(attryes, aes(x = HourlyRate)) + geom_bar()
ggplot(attryes, aes(x = JobInvolvement)) + geom_bar()
ggplot(attryes, aes(x = JobLevel)) + geom_bar()
ggplot(attryes, aes(x = JobRole)) + geom_bar()
ggplot(attryes, aes(x = JobSatisfaction)) + geom_bar()
ggplot(attryes, aes(x = MaritalStatus)) + geom_bar()
ggplot(attryes, aes(x = MonthlyIncome)) + geom_histogram(binwidth = 5000)
ggplot(attryes, aes(x = MonthlyRate)) + geom_histogram(binwidth = 10000)
ggplot(attryes, aes(x = NumCompaniesWorked)) + geom_bar()
ggplot(attryes, aes(x = Over18)) + geom_bar()
ggplot(attryes, aes(x = OverTime)) + geom_bar()
ggplot(attryes, aes(x = PercentSalaryHike)) + geom_bar()
ggplot(attryes, aes(x = PerformanceRating)) + geom_histogram(binwidth = 0.5)
ggplot(attryes, aes(x = RelationshipSatisfaction)) + geom_bar()
ggplot(attryes, aes(x = StandardHours)) + geom_histogram(binwidth = 1)
ggplot(attryes, aes(x = StockOptionLevel)) + geom_bar()
ggplot(attryes, aes(x = TotalWorkingYears)) + geom_bar()
ggplot(attryes, aes(x = TrainingTimesLastYear)) + geom_bar()
ggplot(attryes, aes(x = WorkLifeBalance)) + geom_bar()
ggplot(attryes, aes(x = YearsAtCompany)) + geom_bar()
ggplot(attryes, aes(x = YearsInCurrentRole)) + geom_bar()
ggplot(attryes, aes(x = YearsSinceLastPromotion)) + geom_bar()
ggplot(attryes, aes(x = YearsWithCurrManager)) + geom_bar()
No attrition
# One distribution plot per column for the employees who stayed (attrno),
# in column order.
# Columns where a bin width was requested (DailyRate, MonthlyIncome,
# MonthlyRate, PerformanceRating) use geom_histogram(): geom_bar() has no
# `binwidth` parameter and only emits a warning when one is passed.
ggplot(attrno, aes(x = Age)) + geom_bar()
ggplot(attrno, aes(x = Attrition)) + geom_bar()
ggplot(attrno, aes(x = BusinessTravel)) + geom_bar()
ggplot(attrno, aes(x = DailyRate)) + geom_histogram(binwidth = 100)
ggplot(attrno, aes(x = Department)) + geom_bar()
ggplot(attrno, aes(x = DistanceFromHome)) + geom_bar()
ggplot(attrno, aes(x = Education)) + geom_bar()
ggplot(attrno, aes(x = EducationField)) + geom_bar()
ggplot(attrno, aes(x = EmployeeCount)) + geom_bar()
ggplot(attrno, aes(x = EmployeeNumber)) + geom_bar()
ggplot(attrno, aes(x = EnvironmentSatisfaction)) + geom_bar()
ggplot(attrno, aes(x = Gender)) + geom_bar()
ggplot(attrno, aes(x = HourlyRate)) + geom_bar()
ggplot(attrno, aes(x = JobInvolvement)) + geom_bar()
ggplot(attrno, aes(x = JobLevel)) + geom_bar()
ggplot(attrno, aes(x = JobRole)) + geom_bar()
ggplot(attrno, aes(x = JobSatisfaction)) + geom_bar()
ggplot(attrno, aes(x = MaritalStatus)) + geom_bar()
ggplot(attrno, aes(x = MonthlyIncome)) + geom_histogram(binwidth = 5000)
ggplot(attrno, aes(x = MonthlyRate)) + geom_histogram(binwidth = 10000)
ggplot(attrno, aes(x = NumCompaniesWorked)) + geom_bar()
ggplot(attrno, aes(x = Over18)) + geom_bar()
ggplot(attrno, aes(x = OverTime)) + geom_bar()
ggplot(attrno, aes(x = PercentSalaryHike)) + geom_bar()
ggplot(attrno, aes(x = PerformanceRating)) + geom_histogram(binwidth = 0.5)
ggplot(attrno, aes(x = RelationshipSatisfaction)) + geom_bar()
ggplot(attrno, aes(x = StandardHours)) + geom_bar()
ggplot(attrno, aes(x = StockOptionLevel)) + geom_bar()
ggplot(attrno, aes(x = TotalWorkingYears)) + geom_bar()
ggplot(attrno, aes(x = TrainingTimesLastYear)) + geom_bar()
ggplot(attrno, aes(x = WorkLifeBalance)) + geom_bar()
ggplot(attrno, aes(x = YearsAtCompany)) + geom_bar()
ggplot(attrno, aes(x = YearsInCurrentRole)) + geom_bar()
ggplot(attrno, aes(x = YearsSinceLastPromotion)) + geom_bar()
ggplot(attrno, aes(x = YearsWithCurrManager)) + geom_bar()
# Pairwise scatter plots on the raw data to eyeball relationships between
# numeric variables (first aes() argument is x, second is y).
ggplot(attraw, aes(Age, MonthlyIncome)) + geom_point()
ggplot(attraw, aes(DailyRate, DistanceFromHome)) + geom_point()
ggplot(attraw, aes(YearsAtCompany, HourlyRate)) + geom_point()
ggplot(attraw, aes(NumCompaniesWorked, MonthlyIncome)) + geom_point()
ggplot(attraw, aes(TotalWorkingYears, PercentSalaryHike)) + geom_point()
ggplot(attraw, aes(YearsAtCompany, MonthlyIncome)) + geom_point()
ggplot(attraw, aes(YearsAtCompany, YearsSinceLastPromotion)) + geom_point()
ggplot(attraw, aes(DailyRate, MonthlyRate)) + geom_point()
ggplot(attraw, aes(Age, YearsSinceLastPromotion)) + geom_point()
ggplot(attraw, aes(StockOptionLevel, YearsSinceLastPromotion)) + geom_point()
ggplot(attraw, aes(Age, TotalWorkingYears)) + geom_point()
ggplot(attraw, aes(YearsInCurrentRole, MonthlyIncome)) + geom_point()
ggplot(attraw, aes(YearsSinceLastPromotion, YearsWithCurrManager)) + geom_point()
Multiple regression
# just for kicks
# Work on copies so the original data sets keep their "Yes"/"No" labels.
attrnewy <- attryes
attrnewn <- attrno
attrnewr <- attraw

# Recode Attrition as a numeric indicator. The values are numbers rather than
# the strings "1"/"0", so lm() receives a numeric response directly instead of
# depending on an implicit character-to-number conversion.
#   attcoly: 1 where the leaver subset says "Yes", NA otherwise
#   attcoln: 1 where the stayer subset says "No", NA otherwise
#   attcolr: 1 for "Yes", 0 for anything else, over the full data
attcoly <- ifelse(attrnewy$Attrition == "Yes", 1, NA)
attcoln <- ifelse(attrnewn$Attrition == "No", 1, NA)
attcolr <- ifelse(attrnewr$Attrition == "Yes", 1, 0)
attrnewy$Attrition <- attcoly
attrnewn$Attrition <- attcoln
attrnewr$Attrition <- attcolr

# Linear model of the indicator on all numeric predictors, leavers only.
# NOTE(review): attrnewy was built from Attrition == "Yes" rows, so the
# response is 1 in every row; expect an intercept of 1 and slopes that are
# only floating-point noise. This fit carries no information about attrition.
lm(
  formula = Attrition ~ Age + DailyRate + DistanceFromHome + Education +
    EnvironmentSatisfaction + HourlyRate + JobInvolvement + JobLevel +
    JobSatisfaction + MonthlyIncome + MonthlyRate + NumCompaniesWorked +
    PercentSalaryHike + PerformanceRating + RelationshipSatisfaction +
    StockOptionLevel + TotalWorkingYears + TrainingTimesLastYear +
    WorkLifeBalance + YearsAtCompany + YearsInCurrentRole +
    YearsSinceLastPromotion + YearsWithCurrManager,
  data = attrnewy
)
##
## Call:
## lm(formula = Attrition ~ Age + DailyRate + DistanceFromHome +
## Education + EnvironmentSatisfaction + HourlyRate + JobInvolvement +
## JobLevel + JobSatisfaction + MonthlyIncome + MonthlyRate +
## NumCompaniesWorked + PercentSalaryHike + PerformanceRating +
## RelationshipSatisfaction + StockOptionLevel + TotalWorkingYears +
## TrainingTimesLastYear + WorkLifeBalance + YearsAtCompany +
## YearsInCurrentRole + YearsSinceLastPromotion + YearsWithCurrManager,
## data = attrnewy)
##
## Coefficients:
## (Intercept) Age
## 1.000e+00 2.858e-17
## DailyRate DistanceFromHome
## 1.870e-19 -2.012e-17
## Education EnvironmentSatisfaction
## -2.364e-16 -2.360e-17
## HourlyRate JobInvolvement
## 1.556e-17 2.657e-16
## JobLevel JobSatisfaction
## 2.083e-16 3.037e-16
## MonthlyIncome MonthlyRate
## -4.951e-21 2.335e-20
## NumCompaniesWorked PercentSalaryHike
## 2.489e-16 -1.179e-16
## PerformanceRating RelationshipSatisfaction
## 8.658e-16 -4.273e-16
## StockOptionLevel TotalWorkingYears
## -1.996e-16 -1.253e-16
## TrainingTimesLastYear WorkLifeBalance
## -4.047e-16 -5.555e-16
## YearsAtCompany YearsInCurrentRole
## 8.261e-17 3.067e-17
## YearsSinceLastPromotion YearsWithCurrManager
## -1.783e-16 1.370e-16
# Same linear model, fitted on the stayers-only data (attrnewn).
# NOTE(review): the coefficients printed below are an intercept of 1 and
# slopes around 1e-16, which suggests the response is constant in this subset.
lm(
  formula = Attrition ~ Age + DailyRate + DistanceFromHome + Education +
    EnvironmentSatisfaction + HourlyRate + JobInvolvement + JobLevel +
    JobSatisfaction + MonthlyIncome + MonthlyRate + NumCompaniesWorked +
    PercentSalaryHike + PerformanceRating + RelationshipSatisfaction +
    StockOptionLevel + TotalWorkingYears + TrainingTimesLastYear +
    WorkLifeBalance + YearsAtCompany + YearsInCurrentRole +
    YearsSinceLastPromotion + YearsWithCurrManager,
  data = attrnewn
)
##
## Call:
## lm(formula = Attrition ~ Age + DailyRate + DistanceFromHome +
## Education + EnvironmentSatisfaction + HourlyRate + JobInvolvement +
## JobLevel + JobSatisfaction + MonthlyIncome + MonthlyRate +
## NumCompaniesWorked + PercentSalaryHike + PerformanceRating +
## RelationshipSatisfaction + StockOptionLevel + TotalWorkingYears +
## TrainingTimesLastYear + WorkLifeBalance + YearsAtCompany +
## YearsInCurrentRole + YearsSinceLastPromotion + YearsWithCurrManager,
## data = attrnewn)
##
## Coefficients:
## (Intercept) Age
## 1.000e+00 1.320e-16
## DailyRate DistanceFromHome
## -1.064e-18 -7.623e-18
## Education EnvironmentSatisfaction
## -6.703e-16 4.264e-17
## HourlyRate JobInvolvement
## -2.611e-18 -5.119e-16
## JobLevel JobSatisfaction
## 5.044e-16 -2.218e-16
## MonthlyIncome MonthlyRate
## -1.121e-19 6.625e-20
## NumCompaniesWorked PercentSalaryHike
## -7.704e-17 8.983e-17
## PerformanceRating RelationshipSatisfaction
## 1.483e-15 4.010e-16
## StockOptionLevel TotalWorkingYears
## 1.077e-16 -1.272e-16
## TrainingTimesLastYear WorkLifeBalance
## 1.435e-18 1.197e-16
## YearsAtCompany YearsInCurrentRole
## 2.890e-17 8.767e-17
## YearsSinceLastPromotion YearsWithCurrManager
## -1.480e-16 8.878e-17
# Same linear model, fitted on the full data (attrnewr), where Attrition takes
# both values; this is the only one of the three fits with a varying response.
# NOTE(review): Attrition is a two-valued outcome; consider a logistic model
# (glm with family = binomial) instead of lm() -- TODO confirm with the author.
lm(
  formula = Attrition ~ Age + DailyRate + DistanceFromHome + Education +
    EnvironmentSatisfaction + HourlyRate + JobInvolvement + JobLevel +
    JobSatisfaction + MonthlyIncome + MonthlyRate + NumCompaniesWorked +
    PercentSalaryHike + PerformanceRating + RelationshipSatisfaction +
    StockOptionLevel + TotalWorkingYears + TrainingTimesLastYear +
    WorkLifeBalance + YearsAtCompany + YearsInCurrentRole +
    YearsSinceLastPromotion + YearsWithCurrManager,
  data = attrnewr
)
##
## Call:
## lm(formula = Attrition ~ Age + DailyRate + DistanceFromHome +
## Education + EnvironmentSatisfaction + HourlyRate + JobInvolvement +
## JobLevel + JobSatisfaction + MonthlyIncome + MonthlyRate +
## NumCompaniesWorked + PercentSalaryHike + PerformanceRating +
## RelationshipSatisfaction + StockOptionLevel + TotalWorkingYears +
## TrainingTimesLastYear + WorkLifeBalance + YearsAtCompany +
## YearsInCurrentRole + YearsSinceLastPromotion + YearsWithCurrManager,
## data = attrnewr)
##
## Coefficients:
## (Intercept) Age
## 9.322e-01 -3.857e-03
## DailyRate DistanceFromHome
## -3.351e-05 3.859e-03
## Education EnvironmentSatisfaction
## -9.891e-04 -3.510e-02
## HourlyRate JobInvolvement
## -2.341e-04 -6.199e-02
## JobLevel JobSatisfaction
## -2.366e-02 -3.470e-02
## MonthlyIncome MonthlyRate
## -1.226e-06 6.207e-07
## NumCompaniesWorked PercentSalaryHike
## 1.430e-02 -4.303e-03
## PerformanceRating RelationshipSatisfaction
## 3.053e-02 -1.751e-02
## StockOptionLevel TotalWorkingYears
## -5.513e-02 -3.103e-03
## TrainingTimesLastYear WorkLifeBalance
## -1.661e-02 -2.893e-02
## YearsAtCompany YearsInCurrentRole
## 5.986e-03 -1.099e-02
## YearsSinceLastPromotion YearsWithCurrManager
## 1.167e-02 -1.193e-02
# Need to narrow down variables to check correlation